﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Xml;

namespace PatternSearch
{
    /// <summary>
    /// Loads <see cref="Word"/> entries from an XML corpus.
    /// </summary>
    public static class Corpus
    {
        /// <summary>
        /// Reads every <c>w</c> element from the XML in <paramref name="handle"/> and
        /// turns it into a <see cref="Word"/>.
        /// </summary>
        /// <remarks>
        /// For each <c>w</c> start element the node read immediately afterwards supplies the
        /// form. If the next content node is a <c>lemma</c> start element, the node read right
        /// after it supplies the lemma, and the run of <c>variant</c> elements that follows
        /// supplies the tags through their <c>TAG</c> attribute. A word is dropped entirely as
        /// soon as one of its tags contains <c>"K1"</c>. Variants without a <c>TAG</c>
        /// attribute are ignored.
        /// NOTE(review): the traversal assumes non-empty <c>w</c> and <c>lemma</c> elements;
        /// empty ones are not specially handled - confirm against the corpus format.
        /// </remarks>
        /// <param name="handle">Readable stream containing the XML document. It is not closed by this method.</param>
        /// <param name="cleanCorpus">Not read by the current implementation; kept so existing callers keep compiling.</param>
        /// <returns>The words that were kept, in document order.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="handle"/> is null.</exception>
        /// <exception cref="XmlException">The stream does not contain well-formed XML.</exception>
        public static List<Word> LoadXML(FileStream handle, bool cleanCorpus)
        {
            if (handle == null)
                throw new ArgumentNullException("handle");

            var words = new List<Word>();

            // Default reader settings have CloseInput == false, so disposing the reader
            // releases the reader itself while leaving the caller's stream open.
            using (XmlReader reader = XmlReader.Create(handle))
            {
                while (reader.Read())
                {
                    if (!reader.IsStartElement("w"))
                        continue;

                    string lemma = String.Empty;
                    var tags = new List<string>();
                    bool skip = false;

                    // The node directly after <w> carries the form.
                    reader.Read();
                    string form = reader.Value;

                    reader.Read();
                    if (reader.IsStartElement("lemma"))
                    {
                        // The node directly after <lemma> carries the lemma.
                        reader.Read();
                        lemma = reader.Value;

                        // IsStartElement moves to the next content node first, so whitespace
                        // between variants is skipped; the first non-variant node ends the run.
                        while (reader.Read() && reader.IsStartElement("variant"))
                        {
                            string tag = reader.GetAttribute("TAG");

                            // GetAttribute returns null when the attribute is absent.
                            if (tag == null)
                                continue;

                            if (tag.Contains("K1"))
                            {
                                skip = true;
                                break;
                            }
                            tags.Add(tag);
                        }
                    }

                    if (!skip)
                        words.Add(new Word() { Lemma = lemma, Form = form, TAGS = tags });
                }
            }

            return words;
        }
    }

    /// <summary>
    /// One corpus entry: a form, its lemma and the tags collected for it.
    /// </summary>
    public class Word
    {
        /// <summary>
        /// Creates a word whose <see cref="TAGS"/> list is empty instead of null, so it can be
        /// enumerated safely even when no tags were assigned. Object initializers may still
        /// replace the list.
        /// </summary>
        public Word()
        {
            TAGS = new List<string>();
        }

        /// <summary>The form of the word; null until assigned.</summary>
        public string Form { get; set; }

        /// <summary>The lemma of the word; null until assigned.</summary>
        public string Lemma { get; set; }

        /// <summary>
        /// Not assigned by <c>Corpus.LoadXML</c>; null until a caller sets it.
        /// NOTE(review): presumably a semantic type label - confirm with the code that fills it.
        /// </summary>
        public string Semtype { get; set; }

        /// <summary>Tags attached to the word; an empty list by default.</summary>
        public List<string> TAGS { get; set; }
    }

}